DISCLAIMER: This dataset is owned and supplied by the Dutch datamining company Sentient Machine Research, and is based on real world business data. You are allowed to use this dataset and accompanying information for non commercial research and education purposes only.
El objetivo del análisis es poder predecir si un cliente comprará una póliza de seguros para una autocaravana.
Para ello, disponemos de un dataset de entrenamiento con 5822 registros de clientes, cada uno con 86 atributos, que se distribuyen en 43 socio-demográficos y 42 relacionados con la propiedad de productos. La variable nº 86, 'CARAVAN, nº of mobile home policies' es la variable objetivo, que recibe los siguientes valores:
No dispone de ninguna póliza de seguro de autocaravana.
Dispone de 1 póliza de autocaravana.
Por otra parte, disponemos de un dataset de test con 4000 registros.
Para poder llevar a cabo la predicción, debemos analizar las variables para identificar cuales pueden ser más favorables a la hora de realizarnos un perfil de cliente potencia.
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import scipy.stats as ss
from matplotlib import pyplot as plt
import warnings
import statistics as st
import openpyxl
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)
Funciones que vamos a utilizar para el análisis exploratorio de los datos:
def dame_variables_categoricas(dataset=None):
if dataset is None:
print(u'\nFaltan argumentos por pasar a la función')
return 1
lista_variables_categoricas = []
other = []
for i in dataset.columns:
if (dataset[i].dtype!=float) & (dataset[i].dtype!=int):
unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
if unicos < 100:
lista_variables_categoricas.append(i)
else:
other.append(i)
return lista_variables_categoricas, other
def duplicate_columns(frame):
groups = frame.columns.to_series().groupby(frame.dtypes).groups
dups = []
for t, v in groups.items():
cs = frame[v].columns
vs = frame[v]
lcs = len(cs)
for i in range(lcs):
ia = vs.iloc[:,i].values
for j in range(i+1, lcs):
ja = vs.iloc[:,j].values
if np.array_equal(ia, ja):
dups.append(cs[i])
break
return dups
def plot_feature(df, col_name, isContinuous, target):
"""
Visualize a variable with and without faceting on the loan status.
- df dataframe
- col_name is the variable name in the dataframe
- full_name is the full variable name
- continuous is True if the variable is continuous, False otherwise
"""
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12,3), dpi=90)
count_null = df[col_name].isnull().sum()
if isContinuous:
sns.histplot(df.loc[df[col_name].notnull(), col_name], kde=False, ax=ax1)
else:
sns.countplot(df[col_name], order=sorted(df[col_name].unique()), color='#5975A4', saturation=1, ax=ax1)
ax1.set_xlabel(col_name)
ax1.set_ylabel('Count')
ax1.set_title(col_name+ ' Numero de nulos: '+str(count_null))
plt.xticks(rotation = 90)
if isContinuous:
sns.boxplot(x=col_name, y=target, data=df, ax=ax2)
ax2.set_ylabel('')
ax2.set_title(col_name + ' by '+target)
else:
data = df.groupby(col_name)[target].value_counts(normalize=True).to_frame('proportion').reset_index()
data.columns = [i, target, 'proportion']
#sns.barplot(x = col_name, y = 'proportion', hue= target, data = data, saturation=1, ax=ax2)
sns.barplot(x = col_name, y = 'proportion', hue= target, data = data, saturation=1, ax=ax2)
ax2.set_ylabel(target+' fraction')
ax2.set_title(target)
plt.xticks(rotation = 90)
ax2.set_xlabel(col_name)
plt.tight_layout()
def plot_feature_mod(df, col_name, isContinuous, target):
for x in range(1,len(df_my_dictionary)):
if col_name == df_my_dictionary.Name.loc[x]:
descripcion = df_my_dictionary.Description.loc[x]
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12,3), dpi=90)
count_null = df[col_name].isnull().sum()
if isContinuous:
sns.histplot(df.loc[df[col_name].notnull(), col_name], kde=False, ax=ax1)
else:
sns.countplot(df[col_name], order=sorted(df[col_name].unique()), color='#5975A4', saturation=1, ax=ax1)
ax1.set_xlabel(col_name)
ax1.set_ylabel('Count')
ax1.set_title(descripcion+', nulos: '+str(count_null))
plt.xticks(rotation = 90)
if isContinuous:
sns.boxplot(x=col_name, y=target, data=df, ax=ax2)
ax2.set_ylabel('')
ax2.set_title(descripcion + ' & '+target)
else:
data = df.groupby(col_name)[target].value_counts(normalize=True).to_frame('proportion').reset_index()
data.columns = [i, target, 'proportion']
#sns.barplot(x = col_name, y = 'proportion', hue= target, data = data, saturation=1, ax=ax2)
sns.barplot(x = col_name, y = 'proportion', hue= target, data = data, saturation=1, ax=ax2)
ax2.set_ylabel(target+' fraction')
ax2.set_title(target+' & '+descripcion)
plt.xticks(rotation = 90)
ax2.set_xlabel(col_name)
plt.tight_layout()
def get_corr_matrix(dataset = None, metodo='pearson', size_figure=[10,8]):
# Para obtener la correlación de Spearman, sólo cambiar el metodo por 'spearman'
if dataset is None:
print(u'\nHace falta pasar argumentos a la función')
return 1
sns.set(style="white")
# Compute the correlation matrix
corr = dataset.corr(method=metodo)
# Set self-correlation to zero to avoid distraction
for i in range(corr.shape[0]):
corr.iloc[i, i] = 0
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=size_figure)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, center=0,
square=True, linewidths=.5, cmap ='viridis' ) #cbar_kws={"shrink": .5}
plt.show()
return 0
def get_deviation_of_mean_perc(pd_loan, list_var_continuous, target, multiplier):
"""
Devuelve el porcentaje de valores que exceden del intervalo de confianza
:type series:
:param multiplier:
:return:
"""
pd_final = pd.DataFrame()
for i in list_var_continuous:
series_mean = pd_loan[i].mean()
series_std = pd_loan[i].std()
std_amp = multiplier * series_std
left = series_mean - std_amp
right = series_mean + std_amp
size_s = pd_loan[i].size
perc_goods = pd_loan[i][(pd_loan[i] >= left) & (pd_loan[i] <= right)].size/size_s
perc_excess = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size/size_s
if perc_excess>0:
pd_concat_percent = pd.DataFrame(pd_loan[target][(pd_loan[i] < left) | (pd_loan[i] > right)]\
.value_counts(normalize=True).reset_index()).T
#pd_concat_percent.columns = [pd_concat_percent.iloc[0,0],
# pd_concat_percent.iloc[0,1]]
pd_concat_percent = pd_concat_percent.drop('index',axis=0)
pd_concat_percent['variable'] = i
pd_concat_percent['sum_outlier_values'] = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size
pd_concat_percent['porcentaje_sum_null_values'] = perc_excess
pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
if pd_final.empty:
print('No existen variables con valores nulos')
return pd_final
def get_percent_null_values_target(pd_loan, list_var_continuous, target):
pd_final = pd.DataFrame()
for i in list_var_continuous:
if pd_loan[i].isnull().sum()>0:
pd_concat_percent = pd.DataFrame(pd_loan[target][pd_loan[i].isnull()]\
.value_counts(normalize=True).reset_index()).T
pd_concat_percent.columns = [pd_concat_percent.iloc[0,0],
pd_concat_percent.iloc[0,1]]
pd_concat_percent = pd_concat_percent.drop('index',axis=0)
pd_concat_percent['variable'] = i
pd_concat_percent['sum_null_values'] = pd_loan[i].isnull().sum()
pd_concat_percent['porcentaje_sum_null_values'] = pd_loan[i].isnull().sum()/pd_loan.shape[0]
pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
if pd_final.empty:
print('No existen variables con valores nulos')
return pd_final
def cramers_v(confusion_matrix):
"""
calculate Cramers V statistic for categorial-categorial association.
uses correction from Bergsma and Wicher,
Journal of the Korean Statistical Society 42 (2013): 323-328
confusion_matrix: tabla creada con pd.crosstab()
"""
chi2 = ss.chi2_contingency(confusion_matrix)[0]
n = confusion_matrix.sum()
phi2 = chi2 / n
r, k = confusion_matrix.shape
phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
rcorr = r - ((r-1)**2)/(n-1)
kcorr = k - ((k-1)**2)/(n-1)
return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
def blaus_index(arr):
return 1 - sum((arr.value_counts() / len(arr)) ** 2)
Cargamos una serie de dataframes que nos permitiran interpretar mejor los datos del modelo de entrenamiento para poder realizar el análisis exploratorio.
df_my_dictionary = pd.read_csv("../data/raw/tic/my_dictionary.txt",sep=" ")
df_l0 = pd.read_csv("../data/raw/tic/L0.txt",sep=";")
df_l1 = pd.read_csv("../data/raw/tic/L1.txt",sep=";")
df_l2 = pd.read_csv("../data/raw/tic/L2.txt",sep=";")
df_l3 = pd.read_csv("../data/raw/tic/L3.txt",sep=";")
df_l4 = pd.read_csv("../data/raw/tic/L4.txt",sep=";")
Cargamos los datos de 'ticdata2000.txt' en el df_train que servirá para entrenar el modelo. Cargamos los datos de 'ticeval2000.txt' y 'tictgts2000' y los unimos en el df_test que servirá para testear el modelo.
# df_train:
df_train = pd.read_csv("../data/raw/tic/ticdata2000.txt", sep="\t", header=None)
df_train.columns = list(df_my_dictionary.Name)
# df_test:
df_ticeval2000 = pd.read_csv("../data/raw/tic/ticeval2000.txt", sep="\t", header=None)
df_tictgts2000= pd.read_csv("../data/raw/tic/tictgts2000.txt", sep="\t", header=None)
df_test = pd.concat([df_ticeval2000, df_tictgts2000], axis=1)
df_test.columns = df_train.columns = list(df_my_dictionary.Name)
A continuación analizamos los dataset para comprobar que la distribución es similar y se puede proceder con el análisis:
print('== Train\n', df_train['CARAVAN'].value_counts(normalize=True))
print('== Test\n', df_test['CARAVAN'].value_counts(normalize=True))
== Train 0 0.940227 1 0.059773 Name: CARAVAN, dtype: float64 == Test 0 0.9405 1 0.0595 Name: CARAVAN, dtype: float64
print(df_train.shape, df_train.drop_duplicates().shape)
(5822, 86) (5220, 86)
df_train.dtypes
MOSTYPE int64 MAANTHUI int64 MGEMOMV int64 MGEMLEEF int64 MOSHOOFD int64 MGODRK int64 MGODPR int64 MGODOV int64 MGODGE int64 MRELGE int64 MRELSA int64 MRELOV int64 MFALLEEN int64 MFGEKIND int64 MFWEKIND int64 MOPLHOOG int64 MOPLMIDD int64 MOPLLAAG int64 MBERHOOG int64 MBERZELF int64 MBERBOER int64 MBERMIDD int64 MBERARBG int64 MBERARBO int64 MSKA int64 MSKB1 int64 MSKB2 int64 MSKC int64 MSKD int64 MHHUUR int64 MHKOOP int64 MAUT1 int64 MAUT2 int64 MAUT0 int64 MZFONDS int64 MZPART int64 MINKM30 int64 MINK3045 int64 MINK4575 int64 MINK7512 int64 MINK123M int64 MINKGEM int64 MKOOPKLA int64 PWAPART int64 PWABEDR int64 PWALAND int64 PPERSAUT int64 PBESAUT int64 PMOTSCO int64 PVRAAUT int64 PAANHANG int64 PTRACTOR int64 PWERKT int64 PBROM int64 PLEVEN int64 PPERSONG int64 PGEZONG int64 PWAOREG int64 PBRAND int64 PZEILPL int64 PPLEZIER int64 PFIETS int64 PINBOED int64 PBYSTAND int64 AWAPART int64 AWABEDR int64 AWALAND int64 APERSAUT int64 ABESAUT int64 AMOTSCO int64 AVRAAUT int64 AAANHANG int64 ATRACTOR int64 AWERKT int64 ABROM int64 ALEVEN int64 APERSONG int64 AGEZONG int64 AWAOREG int64 ABRAND int64 AZEILPL int64 APLEZIER int64 AFIETS int64 AINBOED int64 ABYSTAND int64 CARAVAN int64 dtype: object
df_train_null_columns = df_train.isnull().sum().sort_values(ascending=False)
df_train_null_rows = df_train.isnull().sum(axis=1).sort_values(ascending=False)
print(df_train_null_columns.shape, df_train_null_rows.shape)
pd_null_columnas = pd.DataFrame(df_train_null_columns, columns=['nulos_columnas'])
pd_null_filas = pd.DataFrame(df_train_null_rows, columns=['nulos_filas'])
pd_null_filas['target'] = df_train['CARAVAN'].copy()
pd_null_columnas['porcentaje_columnas'] = pd_null_columnas['nulos_columnas']/df_train.shape[0]
pd_null_filas['porcentaje_filas']= pd_null_filas['nulos_filas']/df_train.shape[1]
(86,) (5822,)
pd_null_columnas
| nulos_columnas | porcentaje_columnas | |
|---|---|---|
| MOSTYPE | 0 | 0.0 |
| PLEVEN | 0 | 0.0 |
| PINBOED | 0 | 0.0 |
| PFIETS | 0 | 0.0 |
| PPLEZIER | 0 | 0.0 |
| PZEILPL | 0 | 0.0 |
| PBRAND | 0 | 0.0 |
| PWAOREG | 0 | 0.0 |
| PGEZONG | 0 | 0.0 |
| PPERSONG | 0 | 0.0 |
| PBROM | 0 | 0.0 |
| AWAPART | 0 | 0.0 |
| PWERKT | 0 | 0.0 |
| PTRACTOR | 0 | 0.0 |
| PAANHANG | 0 | 0.0 |
| PVRAAUT | 0 | 0.0 |
| PMOTSCO | 0 | 0.0 |
| PBESAUT | 0 | 0.0 |
| PPERSAUT | 0 | 0.0 |
| PWALAND | 0 | 0.0 |
| PBYSTAND | 0 | 0.0 |
| AWABEDR | 0 | 0.0 |
| MAANTHUI | 0 | 0.0 |
| APERSONG | 0 | 0.0 |
| ABYSTAND | 0 | 0.0 |
| AINBOED | 0 | 0.0 |
| AFIETS | 0 | 0.0 |
| APLEZIER | 0 | 0.0 |
| AZEILPL | 0 | 0.0 |
| ABRAND | 0 | 0.0 |
| AWAOREG | 0 | 0.0 |
| AGEZONG | 0 | 0.0 |
| ALEVEN | 0 | 0.0 |
| AWALAND | 0 | 0.0 |
| ABROM | 0 | 0.0 |
| AWERKT | 0 | 0.0 |
| ATRACTOR | 0 | 0.0 |
| AAANHANG | 0 | 0.0 |
| AVRAAUT | 0 | 0.0 |
| AMOTSCO | 0 | 0.0 |
| ABESAUT | 0 | 0.0 |
| APERSAUT | 0 | 0.0 |
| PWABEDR | 0 | 0.0 |
| PWAPART | 0 | 0.0 |
| MKOOPKLA | 0 | 0.0 |
| MRELOV | 0 | 0.0 |
| MBERZELF | 0 | 0.0 |
| MBERHOOG | 0 | 0.0 |
| MOPLLAAG | 0 | 0.0 |
| MOPLMIDD | 0 | 0.0 |
| MOPLHOOG | 0 | 0.0 |
| MFWEKIND | 0 | 0.0 |
| MFGEKIND | 0 | 0.0 |
| MFALLEEN | 0 | 0.0 |
| MRELSA | 0 | 0.0 |
| MINKGEM | 0 | 0.0 |
| MRELGE | 0 | 0.0 |
| MGODGE | 0 | 0.0 |
| MGODOV | 0 | 0.0 |
| MGODPR | 0 | 0.0 |
| MGODRK | 0 | 0.0 |
| MOSHOOFD | 0 | 0.0 |
| MGEMLEEF | 0 | 0.0 |
| MGEMOMV | 0 | 0.0 |
| MBERBOER | 0 | 0.0 |
| MBERMIDD | 0 | 0.0 |
| MBERARBG | 0 | 0.0 |
| MBERARBO | 0 | 0.0 |
| MINK123M | 0 | 0.0 |
| MINK7512 | 0 | 0.0 |
| MINK4575 | 0 | 0.0 |
| MINK3045 | 0 | 0.0 |
| MINKM30 | 0 | 0.0 |
| MZPART | 0 | 0.0 |
| MZFONDS | 0 | 0.0 |
| MAUT0 | 0 | 0.0 |
| MAUT2 | 0 | 0.0 |
| MAUT1 | 0 | 0.0 |
| MHKOOP | 0 | 0.0 |
| MHHUUR | 0 | 0.0 |
| MSKD | 0 | 0.0 |
| MSKC | 0 | 0.0 |
| MSKB2 | 0 | 0.0 |
| MSKB1 | 0 | 0.0 |
| MSKA | 0 | 0.0 |
| CARAVAN | 0 | 0.0 |
pd_null_filas.head()
| nulos_filas | target | porcentaje_filas | |
|---|---|---|---|
| 0 | 0 | 0 | 0.0 |
| 3888 | 0 | 0 | 0.0 |
| 3886 | 0 | 0 | 0.0 |
| 3885 | 0 | 0 | 0.0 |
| 3884 | 0 | 0 | 0.0 |
No encontramos valores nulos en el dataframe de entrenamiento.
columnas_duplicadas = duplicate_columns(df_train)
columnas_duplicadas
[]
No encontramos columnas duplicadas en el dataframe de entrenamiento.
Para poder dividir las variables categóricas y las numéricas tenemos que hacer un análisis individualizado de cada uno y atender al diccionario, ya que las variales categóricas están codificadas e indexadas por lo que no se diferencian si no las observamos de forma independiente.
Por lo tanto, las variables categóricas son 'MOSTYPE' (nº1) y las comprendidas entre 'MGEMLEEF' (nº 4) y 'PBYSTAND' (nº 63) así como la última, 'CARAVAN' (nº 86), que se trataría de la variable objetivo.
En cambio, las variables numéricas serían las restantes. Esto es, las comprendidas entre 'AWAPART' (nº 64) y 'ABYSTAND' (nº 85) además de MAANTHUI (nº 2) y 'MGEMOMV' (nº3).
Creamos dos dataframes donde dividimos las variables categoricas y las variables numericas.
df_train_categoricas = df_train.iloc[:,np.r_[0,3:64]]
df_train_numericas = df_train.iloc[:,np.r_[1:3,64:86]]
Aplicamos la fórmula con un multiplicador del 1.5 para observar posibles outliers. La razón de este multiplicador es debido a que las variables numéricas únicamente reciben un rango de valores pequeño
df_train_outliers = df_train
list_var_cat, other = dame_variables_categoricas(dataset=df_train_outliers)
df_train_outliers[list_var_cat] = df_train_outliers[list_var_cat].astype("category")
list_var_continuous = list(df_train_outliers.select_dtypes('int').columns)
df_train_outliers[list_var_continuous] = df_train_outliers[list_var_continuous].astype(int)
list_outliers_train = get_deviation_of_mean_perc(df_train_outliers, list_var_continuous, target='CARAVAN', multiplier=1.5)
list_outliers_train = list_outliers_train.loc[:,['variable', 0,1,'sum_outlier_values','porcentaje_sum_null_values']]
list_outliers_train
| variable | 0 | 1 | sum_outlier_values | porcentaje_sum_null_values | |
|---|---|---|---|---|---|
| 0 | MOSTYPE | 0.909270 | 0.090730 | 507 | 0.087083 |
| 1 | MAANTHUI | 0.940541 | 0.059459 | 555 | 0.095328 |
| 2 | MGEMOMV | 0.940670 | 0.059330 | 1045 | 0.179492 |
| 3 | MGEMLEEF | 0.952862 | 0.047138 | 297 | 0.051013 |
| 4 | MOSHOOFD | 0.913043 | 0.086957 | 552 | 0.094813 |
| 5 | MGODRK | 0.961832 | 0.038168 | 262 | 0.045002 |
| 6 | MGODPR | 0.951537 | 0.048463 | 846 | 0.145311 |
| 7 | MGODOV | 0.937650 | 0.062350 | 417 | 0.071625 |
| 8 | MGODGE | 0.936468 | 0.063532 | 787 | 0.135177 |
| 9 | MRELGE | 0.977860 | 0.022140 | 542 | 0.093095 |
| 10 | MRELSA | 0.970260 | 0.029740 | 269 | 0.046204 |
| 11 | MRELOV | 0.978339 | 0.021661 | 554 | 0.095156 |
| 12 | MFALLEEN | 0.962000 | 0.038000 | 500 | 0.085881 |
| 13 | MFGEKIND | 0.927798 | 0.072202 | 831 | 0.142734 |
| 14 | MFWEKIND | 0.943847 | 0.056153 | 837 | 0.143765 |
| 15 | MOPLHOOG | 0.886707 | 0.113293 | 662 | 0.113707 |
| 16 | MOPLMIDD | 0.942460 | 0.057540 | 1008 | 0.173136 |
| 17 | MOPLLAAG | 0.926012 | 0.073988 | 865 | 0.148574 |
| 18 | MBERHOOG | 0.906716 | 0.093284 | 536 | 0.092065 |
| 19 | MBERZELF | 0.926503 | 0.073497 | 449 | 0.077121 |
| 20 | MBERBOER | 0.973770 | 0.026230 | 305 | 0.052387 |
| 21 | MBERMIDD | 0.928696 | 0.071304 | 1150 | 0.197527 |
| 22 | MBERARBG | 0.951807 | 0.048193 | 581 | 0.099794 |
| 23 | MBERARBO | 0.956679 | 0.043321 | 554 | 0.095156 |
| 24 | MSKA | 0.892183 | 0.107817 | 371 | 0.063724 |
| 25 | MSKB1 | 0.932715 | 0.067285 | 431 | 0.074030 |
| 26 | MSKB2 | 0.950855 | 0.049145 | 468 | 0.080385 |
| 27 | MSKC | 0.936856 | 0.063144 | 776 | 0.133288 |
| 28 | MSKD | 0.977716 | 0.022284 | 359 | 0.061663 |
| 29 | MHHUUR | 0.959211 | 0.040789 | 760 | 0.130539 |
| 30 | MHKOOP | 0.959211 | 0.040789 | 760 | 0.130539 |
| 31 | MAUT1 | 0.943168 | 0.056832 | 827 | 0.142047 |
| 32 | MAUT2 | 0.937330 | 0.062670 | 367 | 0.063037 |
| 33 | MAUT0 | 0.971698 | 0.028302 | 318 | 0.054620 |
| 34 | MZFONDS | 0.911552 | 0.088448 | 554 | 0.095156 |
| 35 | MZPART | 0.911712 | 0.088288 | 555 | 0.095328 |
| 36 | MINKM30 | 0.968978 | 0.031022 | 548 | 0.094126 |
| 37 | MINK3045 | 0.940881 | 0.059119 | 795 | 0.136551 |
| 38 | MINK4575 | 0.930939 | 0.069061 | 362 | 0.062178 |
| 39 | MINK7512 | 0.914761 | 0.085239 | 481 | 0.082618 |
| 40 | MINK123M | 0.943396 | 0.056604 | 159 | 0.027310 |
| 41 | MINKGEM | 0.923313 | 0.076687 | 652 | 0.111989 |
| 42 | MKOOPKLA | 0.947680 | 0.052320 | 1013 | 0.173995 |
| 43 | PWAPART | 0.818182 | 0.181818 | 11 | 0.001889 |
| 44 | PWABEDR | 0.939024 | 0.060976 | 82 | 0.014085 |
| 45 | PWALAND | 0.975000 | 0.025000 | 120 | 0.020611 |
| 46 | PPERSAUT | 1.000000 | NaN | 3 | 0.000515 |
| 47 | PBESAUT | 0.958333 | 0.041667 | 48 | 0.008245 |
| 48 | PMOTSCO | 0.927928 | 0.072072 | 222 | 0.038131 |
| 49 | PVRAAUT | 1.000000 | NaN | 9 | 0.001546 |
| 50 | PAANHANG | 0.907692 | 0.092308 | 65 | 0.011165 |
| 51 | PTRACTOR | 0.965035 | 0.034965 | 143 | 0.024562 |
| 52 | PWERKT | 1.000000 | NaN | 21 | 0.003607 |
| 53 | PBROM | 0.979798 | 0.020202 | 396 | 0.068018 |
| 54 | PLEVEN | 0.919014 | 0.080986 | 284 | 0.048780 |
| 55 | PPERSONG | 0.967742 | 0.032258 | 31 | 0.005325 |
| 56 | PGEZONG | 0.842105 | 0.157895 | 38 | 0.006527 |
| 57 | PWAOREG | 0.826087 | 0.173913 | 23 | 0.003951 |
| 58 | PBRAND | 0.964968 | 0.035032 | 314 | 0.053933 |
| 59 | PZEILPL | 0.666667 | 0.333333 | 3 | 0.000515 |
| 60 | PPLEZIER | 0.606061 | 0.393939 | 33 | 0.005668 |
| 61 | PFIETS | 0.897959 | 0.102041 | 147 | 0.025249 |
| 62 | PINBOED | 0.888889 | 0.111111 | 45 | 0.007729 |
| 63 | PBYSTAND | 0.804878 | 0.195122 | 82 | 0.014085 |
| 64 | AWAPART | 1.000000 | NaN | 6 | 0.001031 |
| 65 | AWABEDR | 0.939024 | 0.060976 | 82 | 0.014085 |
| 66 | AWALAND | 0.975000 | 0.025000 | 120 | 0.020611 |
| 67 | APERSAUT | 0.852830 | 0.147170 | 265 | 0.045517 |
| 68 | ABESAUT | 0.958333 | 0.041667 | 48 | 0.008245 |
| 69 | AMOTSCO | 0.927928 | 0.072072 | 222 | 0.038131 |
| 70 | AVRAAUT | 1.000000 | NaN | 9 | 0.001546 |
| 71 | AAANHANG | 0.907692 | 0.092308 | 65 | 0.011165 |
| 72 | ATRACTOR | 0.965035 | 0.034965 | 143 | 0.024562 |
| 73 | AWERKT | 1.000000 | NaN | 21 | 0.003607 |
| 74 | ABROM | 0.979798 | 0.020202 | 396 | 0.068018 |
| 75 | ALEVEN | 0.921502 | 0.078498 | 293 | 0.050326 |
| 76 | APERSONG | 0.967742 | 0.032258 | 31 | 0.005325 |
| 77 | AGEZONG | 0.842105 | 0.157895 | 38 | 0.006527 |
| 78 | AWAOREG | 0.826087 | 0.173913 | 23 | 0.003951 |
| 79 | ABRAND | 0.949640 | 0.050360 | 139 | 0.023875 |
| 80 | AZEILPL | 0.666667 | 0.333333 | 3 | 0.000515 |
| 81 | APLEZIER | 0.606061 | 0.393939 | 33 | 0.005668 |
| 82 | AFIETS | 0.897959 | 0.102041 | 147 | 0.025249 |
| 83 | AINBOED | 0.888889 | 0.111111 | 45 | 0.007729 |
| 84 | ABYSTAND | 0.804878 | 0.195122 | 82 | 0.014085 |
| 85 | CARAVAN | 1.000000 | NaN | 348 | 0.059773 |
El porcentaje de outliers es relativamente bajo por lo que no es necesario tomar acciones inmediatas, pero debe ser tenido en cuenta.
Realizamos la matriz de correlación de las variables numericas continuas mediante el método Pearson:
get_corr_matrix(dataset = df_train_numericas, metodo='pearson', size_figure=[10,8])
0
Como podemos observar en la matriz de correlaciones, la correlación entre variables es baja, a excepción de dos variables, AWAPART y ATRACTOR, que indican el número de seguros 'a terceros' contratados y el número de pólizas de tractor contratadas, respectivamente. A pesar de tener una correlación superior al resto, no es suficiente para poder eliminar ninguna de las dos ya que no transmiten la misma información.
A continuación, aplicamos la matriz de confusión de forma independiente a varias variables entre sí para obtener sinergias.
correlacion_numerica = df_train_numericas.corr('pearson')
new_correlacion_numerica = correlacion_numerica.abs()
new_correlacion_numerica.loc[:,:] = np.tril(new_correlacion_numerica, k=-1) # below main lower triangle of an array
new_corr = new_correlacion_numerica.stack().to_frame('correlation').reset_index().sort_values(by='correlation', ascending=False)
new_corr[new_corr['correlation']>0.2]
| level_0 | level_1 | correlation | |
|---|---|---|---|
| 244 | ATRACTOR | AWALAND | 0.547175 |
| 410 | ABRAND | AWAPART | 0.525261 |
| 270 | AWERKT | ABESAUT | 0.379082 |
# Hacemos un análisis de la distribución demográfica de la variables categórica con mayor diversidad de posibles valores
df_train.describe().filter(items= ['std'], axis=0)
df_train_std = df_train.describe().filter(items= ['std'], axis=0).transpose()
df_train_std[df_train_std['std']>2]
| std | |
|---|---|
| MOSTYPE | 12.846706 |
| MOSHOOFD | 2.856760 |
| MFWEKIND | 2.005283 |
| MOPLLAAG | 2.298125 |
| MHHUUR | 3.089302 |
| MHKOOP | 3.089837 |
| MINKM30 | 2.086099 |
| MKOOPKLA | 2.007150 |
| PPERSAUT | 2.920669 |
MOSTYPE, MOSHOOFD, MFWEKIND, MOPLLAAG, MHHUUR, MHKOOP, MINKM30, MKOOPKLA, PPERSAUT tienen una std superior a 2 por lo que pueden ser favorables a la hora del análisis.
for i in df_train_categoricas:
for n in df_train_categoricas:
if (i!=n):
matriz_confusion = pd.crosstab(df_train_categoricas[i], df_train_categoricas[n])
if(cramers_v(matriz_confusion.values)>0.8):
print(cramers_v(matriz_confusion.values), i, ' + ', n)
else:
pass
0.9974157937677048 MOSTYPE + MOSHOOFD 0.953719608831212 MOSTYPE + MKOOPKLA 0.9974157937677048 MOSHOOFD + MOSTYPE 0.9899162769547203 MHHUUR + MHKOOP 0.9899162769547202 MHKOOP + MHHUUR 0.9947960599531752 MZFONDS + MZPART 0.9947960599531753 MZPART + MZFONDS 0.9537196088312119 MKOOPKLA + MOSTYPE
Se puede observar una gran correlación entre las variables:
La gran correlación entre estas variables nos indica que otorgan la misma información y que con solo una de ellas sería suficiente para el análisis. Para saber que variable debería ser descartada, calculamos sus desviaciones típicas y nos quedamos con la que mayor valor devuelva:
std_mostype = df_train['MOSTYPE'].std()
std_moshoofd = df_train['MOSHOOFD'].std()
print('MOSTYPE: ',std_mostype,'\nMOSHOOFD: ',std_moshoofd)
MOSTYPE: 12.846705702054075 MOSHOOFD: 2.8567598298669026
La variable 'MOSHOOFD' tiene una menor desviación típica por lo que, en caso de decidir descartar alguna de las variables, debería ser esta.
std_mostype = df_train['MOSTYPE'].std()
std_mkoopkla = df_train['MKOOPKLA'].std()
print('MOSTYPE: ',std_mostype,'\nMKOOPKLA: ',std_mkoopkla)
MOSTYPE: 12.846705702054075 MKOOPKLA: 2.0071504808171605
La variable 'MKOOPKLA' tiene una menor desviación típica por lo que, en caso de decidir descartar alguna de las variables, debería ser esta.
std_mhhuur = df_train['MHHUUR'].std()
std_mhkoop = df_train['MHKOOP'].std()
print('MHHUUR: ',std_mhhuur,'\nMHKOOP: ',std_mhkoop)
MHHUUR: 3.0893016264941133 MHKOOP: 3.0898369969009716
La variable 'MHHUUR' tiene una menor desviación típica por lo que, en caso de decidir descartar alguna de las variables, debería ser esta.
std_mzfonds = df_train['MZFONDS'].std()
std_mzpart = df_train['MHKOOP'].std()
print('MZFONDS: ',std_mzfonds,'\nMZPART: ',std_mzpart)
MZFONDS: 1.9786749300835114 MZPART: 3.0898369969009716
La variable 'MZFONDS' tiene una menor desviación típica por lo que, en caso de decidir descartar alguna de las variables, debería ser esta.
Tras obtener los resultados, al tratarse de un análisis exploratorio no vamos a descartar ninguna variable por este motivo.
Analisis de la variable objetivo 'CARAVAN', que hace referencia al número de polizas de seguro de caravanas que posee el cliente.
# Analisis del total de valores de la variable 'CARAVAN'
df_train['CARAVAN'].value_counts()
0 5474 1 348 Name: CARAVAN, dtype: int64
df_caravan = df_train['CARAVAN'].value_counts(normalize=True).mul(100).rename('percent').reset_index()
df_caravan_conteo = df_train['CARAVAN'].value_counts().reset_index()
df_caravan_pc = pd.merge(df_caravan, df_caravan_conteo, on=['index'], how='inner')
df_caravan_pc
| index | percent | CARAVAN | |
|---|---|---|---|
| 0 | 0 | 94.022673 | 5474 |
| 1 | 1 | 5.977327 | 348 |
#fig_1 = px.histogram(df_caravan_pc, x="index", y=['percent'])
fig_1 = px.pie(df_caravan_pc,
names='index',
height=400,
width=600,
hole=0.4,
title='Valores de la variable CARAVAN',
values='percent',
color_discrete_sequence=['#2596be','#e28743']
)
fig_1.update_layout(legend=dict(orientation='h', yanchor='bottom', y=-0.2, xanchor='center', x=0.5))
fig_1.show()
variables_analizar = ['MOSHOOFD','MGEMOMV','PPLEZIER','PZEILPL','PMOTSCO','MKOOPKLA','MHKOOP','MZPART','ALEVEN','APLEZIER']
for i in list(variables_analizar):
if (df_train[i].dtype==float) & (i!='CARAVAN'):
plot_feature_mod(df_train, col_name=i, isContinuous=True, target='CARAVAN')
elif i!='CARAVAN':
plot_feature_mod(df_train, col_name=i, isContinuous=False, target='CARAVAN')
Como se observa en los gráficos, hay varias variables a tener en cuenta que deberían ser analizadas detenidamente, como 'MGEMOMV' y 'MOSHOOFD', así como ciertas variables numéricas que podrían tener una alta correlación entre ellas y podrían ser resumidas en una sola de cara al análisis.
A continuación, llevaremos a cabo una visualización de la distribución de la variable objetivo a través de varias variables categóricas absolutas (no tienen intervalos) para crearnos un perfil de cliente.
dist_var_caravan = df_train.groupby(df_train.MOSTYPE)['CARAVAN'].apply(blaus_index).mul(100).reset_index(name='CARAVAN(%)')
fig_mc = px.histogram(dist_var_caravan, x=str('MOSTYPE'), y=['CARAVAN(%)'],
nbins=41,
labels={
"MOSTYPE" : "MOSTYPE - Customer Subtype (see L0)"},
title= 'Purchased caravan policy based on Customer Subtype')
fig_mc.update_layout(bargap=0.2, yaxis_title='Percentage over total')
Como observamos en el gráfico superior, los subtipos 8 y 12, que corresponden a familias de clase media y a familias jóvenes de clase media-alta, respectivamente, lo que permite deducir que en torno al 25 por ciento de los clientes que encajan en esas características contratan un seguro de autocaravana. Por otra parte, los subtipos 14, 15, 16, 17, 18, 19, 28 y 40 no pueden ser considerados como potenciales clientes si atendemos a la muestra.
dist_var_caravan = df_train.groupby('MGEMOMV')['CARAVAN'].apply(blaus_index).mul(100).reset_index(name='CARAVAN(%)')
fig_mc = px.histogram(dist_var_caravan, x=str('MGEMOMV'), y=['CARAVAN(%)'],
nbins=len(df_train.MGEMOMV.value_counts()),
labels={
"MGEMOMV" : "MGEMOMV - Average size household 1 - 6"},
title= 'Purchased caravan policy based on average size household')
fig_mc.update_layout(bargap=0.2, yaxis_title='Percentage over total')
dist_var_caravan = df_train.groupby('MGEMLEEF')['CARAVAN'].apply(blaus_index).mul(100).reset_index(name='CARAVAN(%)')
fig_mc = px.histogram(dist_var_caravan, x=str('MGEMLEEF'), y=['CARAVAN(%)'],
nbins=len(df_train.MGEMLEEF.value_counts()),
labels={
"MGEMLEEF" : "MGEMLEEF - Average age (see L1)"},
title= 'Purchased caravan policy based on Age range')
fig_mc.update_layout(bargap=0.2, yaxis_title='Percentage over total')
dist_var_caravan = df_train.groupby('MRELGE')['CARAVAN'].apply(blaus_index).mul(100).reset_index(name='CARAVAN(%)')
fig_mc = px.histogram(dist_var_caravan, x=str('MRELGE'), y=['CARAVAN(%)'],
nbins=len(df_train.MRELGE.value_counts()),
labels={
"MRELGE" : "MRELGE - Married (see L3)"},
title= 'Purchased caravan policy based on likely to be married')
fig_mc.update_layout(bargap=0.2, yaxis_title='Percentage over total')
dist_var_caravan = df_train.groupby('MINKM30')['CARAVAN'].apply(blaus_index).mul(100).reset_index(name='CARAVAN(%)')
fig_mc = px.histogram(dist_var_caravan, x=str('MINKM30'), y=['CARAVAN(%)'],
nbins=len(df_train.MINKM30.value_counts()),
labels={
"MINKM30" : "MINKM30 - Average income(see L3)"},
title= 'Purchased caravan policy based on likely to have average income')
fig_mc.update_layout(bargap=0.2, yaxis_title='Percentage over total')
dist_var_caravan = df_train.groupby('MINKGEM')['CARAVAN'].apply(blaus_index).mul(100).reset_index(name='CARAVAN(%)')
fig_mc = px.histogram(dist_var_caravan, x=str('MINKGEM'), y=['CARAVAN(%)'],
nbins=len(df_train.MINKGEM.value_counts()),
labels={
"MINKGEM" : "MINKGEM - Average income(see L3)"},
title= 'Purchased caravan policy based on likely to have average income')
fig_mc.update_layout(bargap=0.2, yaxis_title='Percentage over total')
Como podemos observar en los gráficos, tenemos información suficiente para crearnos un perfil de cliente basándonos en la información recogida en el dataset. Si recopilamos los datos, podemos intuir que clientes que componen un perfil objetivo reunen las siguientes características:
Para finalizar, destacar la alta posibilidad de escalado del análisis.
En caso de necesidad o interés, pueden analizarse un mayor número de variables para, a través de un análisis fundamental, descartar ciertas variables o acotar más el perfil del cliente potencial.